Built with R 4.2.1


This example explores the PatentsView bulk tables, with a focus on assigned inventor sex.

Setup

Before starting, we’ll need to load the package, and point to a directory where we’d like things saved:

# install if needed: remotes::install_guthub("uva-bi-sdad/uspto")
library(uspto)
outDir <- "../patentsview/"

Which patent categories have most and fewest female inventors?

The first step toward answering this question is to assign a sex to inventors based on their given name. The inventors table includes USPTO’s assignment, which we’ll start with:

# you may need to increase your download timeout, depending on your connection
options(timeout = 300)

inventors <- as.data.frame(download_patentsview_bulk("inventor", outDir))

Now, we can add a few of our own prediction methods, including one based on the included USPTO flags:

# add a standardized version of given names
inventors$name_first[is.na(inventors$name_first)] <- ""
inventors$given <- sub(
  "^(.)", "\\U\\1", gsub("^([a-z-]{,6}[.-])+ | +.*", "", tolower(inventors$name_first)),
  perl = TRUE
)

# associate a sex with each unique given name
## install if needed: remotes::install_github("miserman/lusilab")
library(lusilab)
inventor_sex <- predict_demographics(unique(inventors$given), dir = paste0(dirname(outDir), "/names"))
inventor_sex <- inventor_sex[!duplicated(inventor_sex$given), ]

# get categorical predictions
prob_cols <- grep("^prob_", colnames(inventor_sex), value = TRUE)
inventor_sex_preds <- inventor_sex[, prob_cols]
dimnames(inventor_sex_preds) <- list(
  inventor_sex$given,
  sub("prob", "pred", prob_cols, fixed = TRUE)
)
inventor_sex_preds[inventor_sex_preds > .5] <- 1
inventor_sex_preds[inventor_sex_preds < .5] <- 0

Before adding this to the inventors data, we can compare these different methods with the USPTO assignments:

# start with unique, assignment-processed inventors
unique_inventors <- inventors[inventors$attribution_status != 98, ]
unique_inventors <- unique_inventors[
  !duplicated(unique_inventors$id) & unique_inventors$given %in% rownames(inventor_sex_preds),
  c("given", "male_flag")
]

# reverse the male flag, and add the other predictions
unique_inventors$male_flag[is.na(unique_inventors$male_flag)] <- .5
unique_inventors$pred_fem_patentsview <- 1 - unique_inventors$male_flag
unique_inventors <- cbind(unique_inventors[, c(1, 3)], inventor_sex_preds[unique_inventors$given, ])

kable(
  data.frame(
    "Proportion Sexed" = colMeans(unique_inventors[, -1] != .5),
    "Accuracy to PatentsView" = colMeans(unique_inventors[, -1] == unique_inventors[, 2]),
    check.names = FALSE
  ),
  caption = "Sex Predition Method Comparisons"
)
Sex Predition Method Comparisons
Proportion Sexed Accuracy to PatentsView
pred_fem_patentsview 0.9243208 1.0000000
pred_fem_wgnd 0.8962996 0.8513556
pred_fem_fb 0.9001380 0.8402242
pred_fem_fb_scraped 0.7359894 0.7377783
pred_fem_skydeck 0.7991721 0.7862536
pred_fem_usssa 0.7812736 0.7743813
pred_fem_uspto 0.9944673 0.8715289

Now we can move forward with the method with most coverage:

# add assigned sex to inventors data
inventors$pred_fem <- inventor_sex_preds[inventors$given, "pred_fem_uspto"]

## focus on just sexed inventors
inventors <- inventors[!is.na(inventors$pred_fem) & inventors$pred_fem != .5, ]
rownames(inventors) <- inventors$id

# add patent IDs
patent_inventor <- download_patentsview_bulk("patent_inventor", outDir)
patent_inventor <- patent_inventor[patent_inventor$inventor_id %in% inventors$id, ]
inventors <- cbind(inventors[patent_inventor$inventor_id, -1], patent_inventor)

# count inventors predicted to be female in each patent
female_inventors <- tapply(inventors$pred_fem == 1, inventors$patent_id, sum)

World Intellectual Property Organization

Now we can add patent category information, to get breakdowns of classes by inventor sex. There are multiple classification schemes, but we can start with the World Intellectual Property Organization (WIPO) technology fields for a high-level overview:

library(Matrix)

# this makes a patent x WIPO category matrix
categories_wipo <- patentsview_class_matrix("wipo", paste0(outDir, "wipo_matrix.rds"), dir = outDir)
dim(categories_wipo)
#> [1] 7221215      35

# which we can join to the inventors matrix
inventors_wipo <- female_inventors[names(female_inventors) %in% rownames(categories_wipo)]
inventors_wipo <- cbind(as.numeric(inventors_wipo), categories_wipo[names(inventors_wipo), ])

# and get category breakdowns by inventor sex
wipo_summary <- data.frame(
  Any_Female = colSums(inventors_wipo[inventors_wipo[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_wipo[inventors_wipo[, 1] == 0, -1] != 0)
)
wipo_summary$Any_Female_Proportion <- wipo_summary$Any_Female / rowSums(wipo_summary)
wipo_summary <- wipo_summary[order(-wipo_summary$Any_Female_Proportion), ]

# add category titles
wipo_field <- as.data.frame(download_patentsview_bulk("wipo_field", outDir))
rownames(wipo_field) <- wipo_field$id
wipo_summary$Title <- wipo_field[rownames(wipo_summary), "field_title"]

kable(
  wipo_summary,
  col.names = gsub("_", " ", colnames(wipo_summary), fixed = TRUE),
  caption = "World Intellectual Property Organization Categories"
)
World Intellectual Property Organization Categories
Any Female No Female Any Female Proportion Title
15 95837 119325 0.4454179 Biotechnology
16 130366 178793 0.4216795 Pharmaceuticals
11 29377 53264 0.3554773 Analysis of biological materials
14 103048 215153 0.3238456 Organic fine chemistry
18 17566 49561 0.2616831 Food chemistry
19 63981 183795 0.2582211 Basic materials chemistry
7 42753 127448 0.2511912 IT methods for management
22 15148 47121 0.2432671 Micro-structural and nano-technology
17 47131 149718 0.2394272 Macromolecular chemistry, polymers
4 123140 459269 0.2114322 Digital communication
8 95285 385653 0.1981233 Semiconductors
6 195144 793346 0.1974163 Computer technology
13 92372 394940 0.1895541 Medical technology
21 32318 145255 0.1819984 Surface technology, coating
20 30840 142421 0.1779974 Materials, metallurgy
34 33974 157637 0.1773071 Other consumer goods
3 65270 322662 0.1682511 Telecommunications
9 74222 382155 0.1626331 Optics
23 42707 224038 0.1601042 Chemical engineering
2 84874 462135 0.1551602 Audio-visual technology
28 29643 163823 0.1532207 Textile and paper machines
12 35854 201649 0.1509623 Control
24 17007 107006 0.1371388 Environmental technology
10 69791 439170 0.1371245 Measurement
33 31347 198811 0.1361977 Furniture, games
1 83984 535169 0.1356434 Electrical machinery, apparatus, energy
29 39767 286568 0.1218594 Other special machines
5 21258 160036 0.1172571 Basic communication processes
30 11648 109139 0.0964342 Thermal processes and apparatus
32 40029 382258 0.0947910 Transport
25 21193 212107 0.0908401 Handling
27 23310 238994 0.0888664 Engines, pumps, turbines
26 18191 203697 0.0819828 Machine tools
35 20622 238441 0.0796023 Civil engineering
31 21081 275514 0.0710767 Mechanical elements

United States Patent Classification

For a more refined classification, we might look at the United States Patent Classification (USPC), which is most closely related to examination process:

# get the patent x USPC category matrix
categories_uspc <- patentsview_class_matrix(
  "uspc_current", paste0(outDir, "uspc_current_matrix.rds"),
  dir = outDir
)
dim(categories_uspc)
#> [1] 6597925     475

# join to the inventors matrix
inventors_uspc <- female_inventors[names(female_inventors) %in% rownames(categories_uspc)]
inventors_uspc <- cbind(as.numeric(inventors_uspc), categories_uspc[names(inventors_uspc), ])

# and get category breakdowns by inventor sex
uspc_summary <- data.frame(
  Any_Female = colSums(inventors_uspc[inventors_uspc[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_uspc[inventors_uspc[, 1] == 0, -1] != 0)
)
uspc_summary$Any_Female_Proportion <- uspc_summary$Any_Female / rowSums(uspc_summary)
uspc_summary <- uspc_summary[order(-uspc_summary$Any_Female_Proportion), ]

# add category titles
uspc_field <- as.data.frame(download_patentsview_bulk("mainclass_current", outDir))
rownames(uspc_field) <- uspc_field$id
uspc_summary$Title <- uspc_field[rownames(uspc_summary), "title"]

kable(
  uspc_summary[c(1:20, 1:20 + nrow(uspc_summary) - 20), ],
  col.names = gsub("_", " ", colnames(uspc_summary), fixed = TRUE),
  caption = paste(
    "U.S. Patent Classification categories with the highest and lowest",
    "proportion of any female inventor"
  )
)
U.S. Patent Classification categories with the highest and lowest proportion of any female inventor
Any Female No Female Any Female Proportion Title
450 865 619 0.5828841 FOUNDATION GARMENTS
532 9 11 0.4500000 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
530 22685 31356 0.4197739 CHEMISTRY: NATURAL RESINS OR DERIVATIVES; PEPTIDES OR PROTEINS; LIGNINS OR REACTION PRODUCTS THEREOF
536 23999 34275 0.4118303 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
D05 1040 1520 0.4062500 TEXTILE OR PAPER YARD GOODS; SHEET MATERIAL
520 55 81 0.4044118 SYNTHETIC RESINS OR NATURAL RUBBERS – PART OF THE CLASS 520 SERIES
435 50207 80953 0.3827920 CHEMISTRY: MOLECULAR BIOLOGY AND MICROBIOLOGY
424 37241 64363 0.3665308 DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS
514 53199 97628 0.3527154 DRUG, BIO-AFFECTING AND BODY TREATING COMPOSITIONS
544 13827 27792 0.3322281 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
D01 815 1657 0.3296926 EDIBLE PRODUCTS
546 14971 30560 0.3288089 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
506 899 1852 0.3267903 COMBINATORIAL CHEMISTRY TECHNOLOGY: METHOD, LIBRARY, APPARATUS
800 5307 11200 0.3215000 MULTICELLULAR LIVING ORGANISMS AND UNMODIFIED PARTS THEREOF AND RELATED PROCESSES
510 6014 12777 0.3200468 CLEANING COMPOSITIONS FOR SOLID SURFACES, AUXILIARY COMPOSITIONS THEREFOR, OR PROCESSES OF PREPARING THE COMPOSITIONS
548 12885 29900 0.3011569 ORGANIC COMPOUNDS – PART OF THE CLASS 532-570 SERIES
132 2091 4869 0.3004310 TOILET
D28 2973 6924 0.3003941 COSMETIC PRODUCTS AND TOILET ARTICLES
147 8 19 0.2962963 COOPERING
D02 6847 16458 0.2937996 APPAREL AND HABERDASHERY
185 18 447 0.0387097 MOTORS: SPRING, WEIGHT, OR ANIMAL POWERED
406 137 3416 0.0385590 CONVEYORS: FLUID CURRENT
245 9 227 0.0381356 WIRE FABRICS AND STRUCTURE
105 188 4765 0.0379568 RAILWAY ROLLING STOCK
305 64 1628 0.0378251 WHEEL SUBSTITUTES FOR LAND VEHICLES
254 331 8469 0.0376136 IMPLEMENTS OR APPARATUS FOR APPLYING PUSHING OR PULLING FORCE
144 234 6007 0.0374940 WOODWORKING
408 307 7944 0.0372076 CUTTING BY USE OF ROTATING AXIALLY MOVING TOOL
984 124 3226 0.0370149 MUSICAL INSTRUMENTS
226 189 5009 0.0363601 ADVANCING MATERIAL OF INDETERMINATE LENGTH
470 43 1142 0.0362869 THREADED, HEADED FASTENER, OR WASHER MAKING: PROCESS AND APPARATUS
198 865 23188 0.0359623 CONVEYORS: POWER-DRIVEN
299 158 4347 0.0350721 MINING OR IN SITU DISINTEGRATION OF HARD MATERIAL
171 18 511 0.0340265 UNEARTHING PLANTS OR BURIED OBJECTS
42 244 6945 0.0339407 FIREARMS
82 135 4062 0.0321658 TURNING
413 27 818 0.0319527 SHEET METAL CONTAINER MAKING
295 6 195 0.0298507 RAILWAY WHEELS AND AXLES
212 86 3028 0.0276172 TRAVERSING HOISTS
258 0 16 0.0000000 RAILWAY MAIL DELIVERY

How have category breakdowns changed over time?

To get at this question, we’ll need to associate a date with each patent number:

# start with all patent information
patents <- download_patentsview_bulk("patent", outDir)

# associate a date with each patent number
all_dates <- structure(patents$date, names = patents$number)

# get years, and focus on only those found in the USPC set
uspc_year <- substr(all_dates[names(all_dates) %in% rownames(inventors_uspc)], 1, 4)

# use this set of years to break down the overall summaries
uspc_yearly_summaries <- do.call(rbind, lapply(sort(unique(uspc_year)), function(year) {
  d <- inventors_uspc[names(which(uspc_year == year)), ]
  r <- data.frame(
    Any_Female = colSums(d[d[, 1] != 0, -1] != 0, na.rm = TRUE),
    No_Female = colSums(d[d[, 1] == 0, -1] != 0, na.rm = TRUE)
  )
  structure(c(as.numeric(year), r$Any_Female / rowSums(r)), names = c("Year", rownames(r)))
}))
uspc_yearly_summaries[is.na(uspc_yearly_summaries)] <- 0
uspc_yearly_summaries <- uspc_yearly_summaries[, colSums(uspc_yearly_summaries) != 0]

# plot categories with the most positive and negative trends
library(splot)
trends <- sort(cor(uspc_yearly_summaries[, -1], uspc_yearly_summaries[, 1])[, 1], TRUE)
splot(
  uspc_yearly_summaries[, names(trends[c(1:6, 1:3 + length(trends) - 3)])] ~ uspc_yearly_summaries[, 1],
  lines = "spline", levels = list(mv = uspc_field[names(trends[c(1:6, 1:3 + length(trends) - 3)]), "title"]),
  title = "Proportion of Female-Assigned Inventors over Time",
  laby = "Proportion of Patents With Any Female-Assigned Inventor",
  labx = "Year", myl = c(0, .4), leg.title = "U.S. Patent Classification"
)

Cooperative Patent Classifications

Cooperative Patent Classifications (CPC) might give an even more refined look, and could be compared across patent offices:

# get the patent x CPC category matrix
categories_cpc <- patentsview_class_matrix(
  "cpc_current", paste0(outDir, "cpc_current_matrix.rds"),
  dir = outDir
)
dim(categories_cpc)
#> [1] 7228678     670

# join to the inventors matrix
inventors_cpc <- female_inventors[names(female_inventors) %in% rownames(categories_cpc)]
inventors_cpc <- cbind(as.numeric(inventors_cpc), categories_cpc[names(inventors_cpc), ])

# and get category breakdowns by inventor sex
cpc_summary <- data.frame(
  Any_Female = colSums(inventors_cpc[inventors_cpc[, 1] != 0, -1] != 0),
  No_Female = colSums(inventors_cpc[inventors_cpc[, 1] == 0, -1] != 0)
)
cpc_summary$Any_Female_Proportion <- cpc_summary$Any_Female / rowSums(cpc_summary)
cpc_summary <- cpc_summary[order(-cpc_summary$Any_Female_Proportion), ]

# add category titles
cpc_group <- as.data.frame(download_patentsview_bulk("cpc_group", outDir))
rownames(cpc_group) <- cpc_group$id
cpc_summary <- cpc_summary[rownames(cpc_summary) %in% cpc_group$id, ]
cpc_summary$Title <- cpc_group[rownames(cpc_summary), "title"]

cpc_selection <- names(which(rowSums(cpc_summary[, 1:2]) > 5))
cpc_selection <- cpc_selection[c(1:20, 1:20 + length(cpc_selection) - 20)]
kable(
  cpc_summary[cpc_selection, ],
  col.names = gsub("_", " ", colnames(cpc_summary), fixed = TRUE),
  caption = paste(
    "Cooperative Patent Classifications: 20 categories with the highest and lowest",
    "proportion of any female inventor with at least 5 associated patents"
  )
)
Cooperative Patent Classifications: 20 categories with the highest and lowest proportion of any female inventor with at least 5 associated patents
Any Female No Female Any Female Proportion Title
A41C 1080 624 0.6338028 CORSETS; BRASSIERES
C12Y 9228 8665 0.5157324 ENZYMES
A23Y 407 384 0.5145386 INDEXING SCHEME RELATING TO LACTIC OR PROPIONIC ACID BACTERIA USED IN FOODSTUFFS OR FOOD PREPARATION
B42P 449 449 0.5000000 INDEXING SCHEME RELATING TO BOOKS, FILING APPLIANCES OR THE LIKE
C07K 50697 55282 0.4783684 PEPTIDES
C12N 49212 55853 0.4683958 MICROORGANISMS OR ENZYMES; COMPOSITIONS THEREOF; PROPAGATING, PRESERVING, OR MAINTAINING MICROORGANISMS; MUTATION OR GENETIC ENGINEERING; CULTURE MEDIA
A61P 97848 118156 0.4529916 SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMPOUNDS OR MEDICINAL PREPARATIONS
C12R 2959 3605 0.4507922 INDEXING SCHEME ASSOCIATED WITH SUBCLASSES C12C - C12Q, RELATING TO MICROORGANISMS
C12P 11386 14667 0.4370322 FERMENTATION OR ENZYME-USING PROCESSES TO SYNTHESISE A DESIRED CHEMICAL COMPOUND OR COMPOSITION OR TO SEPARATE OPTICAL ISOMERS FROM A RACEMIC MIXTURE
A41B 1246 1610 0.4362745 SHIRTS; UNDERWEAR; BABY LINEN; HANDKERCHIEFS
A61Q 15280 20272 0.4297930 SPECIFIC USE OF COSMETICS OR SIMILAR TOILET PREPARATIONS
C12Q 21156 28804 0.4234588 MEASURING OR TESTING PROCESSES INVOLVING ENZYMES, NUCLEIC ACIDS OR MICROORGANISMS ; COMPOSITIONS OR TEST PAPERS THEREFOR; PROCESSES OF PREPARING SUCH COMPOSITIONS; CONDITION-RESPONSIVE CONTROL IN MICROBIOLOGICAL OR ENZYMOLOGICAL PROCESSES
A61K 109600 152294 0.4184899 PREPARATIONS FOR MEDICAL, DENTAL, OR TOILET PURPOSES
A23V 4069 5723 0.4155433 INDEXING SCHEME RELATING TO FOODS, FOODSTUFFS OR NON-ALCOHOLIC BEVERAGES
A01P 10 15 0.4000000 BIOCIDAL, PEST REPELLANT, PEST ATTRACTANT OR PLANT GROWTH REGULATORY ACTIVITY OF CHEMICAL COMPOUNDS OR PREPARATIONS
A47D 1661 2571 0.3924858 FURNITURE SPECIALLY ADAPTED FOR CHILDREN
G16B 2223 3473 0.3902739 BIOINFORMATICS, i.e. INFORMATION AND COMMUNICATION TECHNOLOGY [ICT] SPECIALLY ADAPTED FOR GENETIC OR PROTEIN-RELATED DATA PROCESSING IN COMPUTATIONAL MOLECULAR BIOLOGY
C07B 4106 6922 0.3723250 GENERAL METHODS OF ORGANIC CHEMISTRY; APPARATUS THEREFOR
A21D 1377 2357 0.3687734 TREATMENT, e.g. PRESERVATION, OF FLOUR OR DOUGH, e.g. BY ADDITION OF MATERIALS; BAKING; BAKERY PRODUCTS; PRESERVATION THEREOF
C11D 8140 14651 0.3571585 DETERGENT COMPOSITIONS ; USE OF SINGLE SUBSTANCES AS DETERGENTS; SOAP OR SOAP-MAKING; RESIN SOAPS; RECOVERY OF GLYCEROL
B66C 343 7818 0.0420292 CRANES; LOAD-ENGAGING ELEMENTS OR DEVICES FOR CRANES, CAPSTANS, WINCHES, OR TACKLES
B27B 193 4508 0.0410551 SAWS FOR WOOD OR SIMILAR MATERIAL; COMPONENTS OR ACCESSORIES THEREFOR
A01F 173 4077 0.0407059 PROCESSING OF HARVESTED PRODUCE; HAY OR STRAW PRESSES; DEVICES FOR STORING AGRICULTURAL OR HORTICULTURAL PRODUCE
B61J 6 143 0.0402685 SHIFTING OR SHUNTING OF RAIL VEHICLES
D03C 36 871 0.0396913 SHEDDING MECHANISMS; PATTERN CARDS OR CHAINS; PUNCHING OF CARDS; DESIGNING PATTERNS
F41A 461 11186 0.0395810 FUNCTIONAL FEATURES OR DETAILS COMMON TO BOTH SMALLARMS AND ORDNANCE, e.g. CANNONS; MOUNTINGS FOR SMALLARMS OR ORDNANCE
G10B 3 75 0.0384615 ORGANS, HARMONIUMS OR SIMILAR WIND MUSICAL INSTRUMENTS WITH ASSOCIATED BLOWING APPARATUS
E01B 129 3255 0.0381206 PERMANENT WAY; PERMANENT-WAY TOOLS; MACHINES FOR MAKING RAILWAYS OF ALL KINDS
D03J 18 481 0.0360721 AUXILIARY WEAVING APPARATUS; WEAVERS’ TOOLS; SHUTTLES
D01H 131 3665 0.0345100 SPINNING OR TWISTING
B62C 2 61 0.0317460 VEHICLES DRAWN BY ANIMALS
B27F 41 1322 0.0300807 DOVETAILED WORK; TENONS; SLOTTING MACHINES FOR WOOD OR SIMILAR MATERIAL; NAILING OR STAPLING MACHINES
B21G 10 326 0.0297619 MAKING NEEDLES, PINS OR NAILS OF METAL
B27L 45 1481 0.0294889 REMOVING BARK OR VESTIGES OF BRANCHES ; SPLITTING WOOD; MANUFACTURE OF VENEER, WOODEN STICKS, WOOD SHAVINGS, WOOD FIBRES OR WOOD POWDER
F16T 12 400 0.0291262 STEAM TRAPS OR LIKE APPARATUS FOR DRAINING-OFF LIQUIDS FROM ENCLOSURES PREDOMINANTLY CONTAINING GASES OR VAPOURS
D02H 7 254 0.0268199 WARPING, BEAMING OR LEASING
F27M 8 295 0.0264026 INDEXING SCHEME RELATING TO ASPECTS OF THE CHARGES OR FURNACES, KILNS, OVENS OR RETORTS
B41G 1 52 0.0188679 APPARATUS FOR BRONZE PRINTING, LINE PRINTING, OR FOR BORDERING OR EDGING SHEETS OR LIKE ARTICLES; AUXILIARY FOR PERFORATING IN CONJUNCTION WITH PRINTING
G06D 0 9 0.0000000 DIGITAL FLUID-PRESSURE COMPUTING DEVICES
G21J 0 20 0.0000000 NUCLEAR EXPLOSIVES; APPLICATIONS THEREOF

Where are there most and fewest female inventors?

The locations table associates inventor IDs with location IDs:

locations <- as.data.frame(download_patentsview_bulk("location", outDir))
locations$state_fips[!is.na(locations$state_fips)] <- formatC(
  locations$state_fips[!is.na(locations$state_fips)],
  width = 2, flag = 0, format = "d"
)
locations$county_fips[!is.na(locations$county_fips)] <- formatC(
  locations$county_fips[!is.na(locations$county_fips)],
  width = 5, flag = 0, format = "d"
)

# we can align this with our inventors data for inventor sex
rownames(locations) <- locations$id
located_inventors <- inventors[!is.na(inventors$location_id), ]
locations <- locations[located_inventors$location_id, ]

Inventor location is recorded as part of each patent, which means inventors may have multiple locations over time. For an initial look, we can focus only on each inventors most recent location:

# add date information to inventor data
inventors$date <- structure(patents$date, names = patents$number)[inventors$patent_id]

# sort inventors by date, then add location information
inventors_last_seen <- inventors[order(inventors$date, decreasing = TRUE), ]
inventors_last_seen <- inventors_last_seen[!duplicated(inventors_last_seen$inventor_id), ]
inventors_last_seen <- inventors_last_seen[inventors_last_seen$location_id %in% locations$id, ]
inventors_last_seen <- cbind(inventors_last_seen, locations[inventors_last_seen$location_id, ])

Now we can look at high-level summaries of locations, like we did with patent classes:

# top countries
breakdown_countries <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$country),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_countries$Proportion_Female <- breakdown_countries$Female / rowSums(breakdown_countries)
breakdown_countries <- breakdown_countries[order(-breakdown_countries$Proportion_Female), ]
kable(
  breakdown_countries[rowSums(breakdown_countries[, 1:2]) > 1e4, ],
  col.names = gsub("_", " ", colnames(breakdown_countries), fixed = TRUE),
  caption = "Countries with at least 10,000 associated inventors"
)
Countries with at least 10,000 associated inventors
Female Male Proportion Female
CN 30697 111127 0.2164443
SU 2868 12402 0.1878193
ES 4204 18262 0.1871272
SG 2121 11255 0.1585676
KR 22265 123451 0.1527972
IL 5889 34888 0.1444196
DK 2318 14376 0.1388523
IN 7962 49936 0.1375177
FR 17124 113171 0.1314248
US 248082 1676877 0.1288765
RU 1767 11997 0.1283784
BE 2662 18493 0.1258331
IT 6479 47636 0.1197265
FI 2414 18484 0.1155134
AU 3433 26615 0.1142505
CA 11256 94016 0.1069230
SE 4272 36451 0.1049039
GB 11575 100464 0.1033122
JP 47228 469031 0.0914812
CH 3774 38745 0.0887603
NO 950 10112 0.0858796
NL 4008 43075 0.0851263
DE 22507 269343 0.0771184
AT 1392 17985 0.0718377
TW 7228 96518 0.0696702
# top states
breakdown_states <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$state_fips),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_states$Proportion_Female <- breakdown_states$Female / rowSums(breakdown_states)
breakdown_states <- breakdown_states[order(-breakdown_states$Proportion_Female), ]
## install if needed: remotes::install_github("uva-bi-sdad/catchment")
library(catchment)
states <- download_census_shapes(paste0(dirname(outDir), "/maps"))
state_names <- structure(states$NAME, names = states$STATEFP)
breakdown_states <- breakdown_states[rownames(breakdown_states) %in% names(state_names), ]
rownames(breakdown_states) <- state_names[rownames(breakdown_states)]
kable(
  breakdown_states[rowSums(breakdown_states[, 1:2]) > 1e4, ],
  col.names = gsub("_", " ", colnames(breakdown_states), fixed = TRUE),
  caption = "States with at least 10,000 associated inventors"
)
States with at least 10,000 associated inventors
Female Male Proportion Female
Maryland 5523 29584 0.1573191
New Jersey 10743 59426 0.1531018
New York 17472 98586 0.1505454
Georgia 5209 29964 0.1480966
Massachusetts 12748 74697 0.1457831
North Carolina 6312 39518 0.1377264
Missouri 3234 20380 0.1369527
Florida 9040 57314 0.1362390
California 53816 342693 0.1357245
Virginia 4600 29469 0.1350201
Minnesota 6730 45965 0.1277161
Illinois 10649 74316 0.1253340
Colorado 5045 35367 0.1248392
Washington 9572 67880 0.1235862
Pennsylvania 9181 66069 0.1220066
Arizona 4056 29204 0.1219483
Tennessee 2435 17556 0.1218048
Oregon 3635 26252 0.1216248
South Carolina 1856 13823 0.1183749
Ohio 8706 65003 0.1181131
Kansas 1409 10543 0.1178882
Louisiana 1243 9388 0.1169222
Texas 14560 110612 0.1163199
Kentucky 1343 10214 0.1162066
Connecticut 4106 31505 0.1153015
Alabama 1254 9627 0.1152468
Indiana 3864 30089 0.1138044
Wisconsin 4529 35678 0.1126421
Michigan 8819 77834 0.1017737
Utah 1927 17285 0.1003019
Oklahoma 1231 11331 0.0979940
Iowa 1484 13770 0.0972860
New Hampshire 1216 11505 0.0955900
# map of counties
library(leaflet)
library(sf)
counties <- st_transform(download_census_shapes(
  paste0(dirname(outDir), "/maps"),
  entity = "county"
), "WGS84")
counties$NAME <- paste0(counties$NAME, ", ", state_names[counties$STATEFP])

breakdown_counties <- as.data.frame(t(vapply(
  split(inventors_last_seen$pred_fem, inventors_last_seen$county_fips),
  function(d) c(Female = sum(d == 1), Male = sum(d == 0)),
  c(0, 0)
)))
breakdown_counties$Total <- rowSums(breakdown_counties)
breakdown_counties$Proportion_Female <- breakdown_counties$Female / breakdown_counties$Total
breakdown_counties$Capped_Total <- breakdown_counties$Total
breakdown_counties$Capped_Total[breakdown_counties$Capped_Total > 5e4] <- 1e5
breakdown_counties <- breakdown_counties[counties$GEOID, ]

pal <- colorNumeric(
  scico::scico(255, direction = -1, palette = "vik"), breakdown_counties$Proportion_Female
)
pal_total <- colorNumeric(
  scico::scico(255, direction = -1, palette = "lajolla"), breakdown_counties$Capped_Total
)
leaflet(counties, options = leafletOptions(attributionControl = FALSE)) |>
  addProviderTiles("CartoDB.Positron") |>
  setView(-95.5810546875, 39.5040407055842, 4) |>
  addControl("Proportion of Patents with Any Inventor Assigned Female", "topright") |>
  addLayersControl(position = "topleft", overlayGroups = c("Total", "Proportion")) |>
  addLegend(
    "bottomright", pal_total, breakdown_counties$Capped_Total,
    opacity = 1,
    title = "Totals", group = "Total"
  ) |>
  addPolygons(
    fillColor = pal_total(breakdown_counties$Capped_Total),
    fillOpacity = .8, weight = 1, color = "#000", highlightOptions = highlightOptions(color = "#fff"),
    group = "Total", label = paste0("County: ", counties$NAME, "; Total: ", breakdown_counties$Total)
  ) |>
  hideGroup("Total") |>
  addLegend(
    "bottomright", pal, breakdown_counties$Proportion_Female,
    opacity = 1,
    title = "Proportions", group = "Proportion"
  ) |>
  addPolygons(
    fillColor = pal(breakdown_counties$Proportion_Female), fillOpacity = .8, weight = 1, color = "#000",
    highlightOptions = highlightOptions(color = "#fff"),
    group = "Proportion", label = paste0(
      "County: ", counties$NAME,
      "; Female: ", breakdown_counties$Female,
      "; Male: ", breakdown_counties$Male,
      "; Proportion Female: ", round(breakdown_counties$Proportion_Female, 3)
    )
  )

How have inventors migrated?

Instead of collapsing inventors to a single location, we could also look for instances of inventors moving between states, and see which moves are most common:

# start with all of located inventors
inventor_states <- cbind(
  located_inventors[, c("inventor_id", "pred_fem")],
  date = all_dates[located_inventors$patent_id],
  state = state_names[locations$state_fips]
)
inventor_states <- inventor_states[!is.na(inventor_states$state), ]
inventor_states <- inventor_states[!is.na(inventor_states$date), ]

# select only those that are associated with multiple patents
inventor_states <- inventor_states[
  inventor_states$inventor_id %in% inventor_states$inventor_id[duplicated(inventor_states$inventor_id)],
]
inventor_states <- inventor_states[order(inventor_states$date, decreasing = TRUE), ]

# then look at each inventor to see if they have more than one state.
# if they do, record their transition; previous state -> new state
inventor_history <- lapply(split(inventor_states, inventor_states$inventor_id), function(d) {
  if (!all(d$state == d$state[[1]])) {
    res <- NULL
    states <- d$state
    for (i in seq_len(nrow(d) - 1)) {
      if (states[[i]] != states[[i + 1]]) res <- c(paste(states[[i + 1]], "->", states[[i]]), res)
    }
    res
  }
})
inventor_moves <- unlist(inventor_history)
inventor_moves <- tapply(inventor_moves, inventor_moves, length)
inventor_transitions <- data.frame(
  do.call(rbind, strsplit(names(inventor_moves), " -> ", fixed = TRUE)),
  inventor_moves
)
colnames(inventor_transitions) <- c("from", "to", "count")

# use these transitions to make a origin x destination matrix
states <- sort(unique(c(inventor_transitions$from, inventor_transitions$to)))
migrations <- matrix(0, length(states), length(states), dimnames = list(states, states))
for (r in seq_len(nrow(inventor_transitions))) {
  move <- inventor_transitions[r, ]
  migrations[move$from, move$to] <- move$count
}
migrations <- migrations[rowSums(migrations) > 1e4, rowSums(migrations) > 1e4]
migrations <- migrations[order(-rowSums(migrations)), order(-rowSums(migrations))]

# and make a chord diagram out of the most frequently involved states
## devtools::install_github("mattflor/chorddiag")
library(chorddiag)
chorddiag(
  migrations,
  groupColors = scico::scico(nrow(migrations), palette = "vik")[
    order(breakdown_states[rownames(migrations), "Proportion_Female"])
  ], showTicks = FALSE, groupnamePadding = 5
)